/**********************************************************************/
/*
   	Author: Karan Makkar, Michelle Han, Adapted from Nikhil Kumar
	Last Updated: June 2024
   	Description: Cleans raw September 2021 SUSENAS data. Outputs deidentified
	cleaned SUSENAS data merged with PMO data.

	This cleaning file should output 2 different datasets:
	1. Cleaned Susenas data:
	sus_sep21_deid_clean
	2. Subset of Susenas person-batch data matched with PMO for SAKERNAS analysis:
	sus_sep21_deid_clean_merged

*/
/**********************************************************************/


*******************************************
* Setup

* Load the project file paths unless global master_run is already "1"
  	if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

* Start a fresh log; its file name is prefixed with today's date (%tdCYND)
	capture log close
	global prefix : display %tdCYND td(`c(current_date)')
	log using "$KP_logs/${prefix}_clean_SUSENAS_sep2021.txt", replace text

* Consumption data (blok43): keep urut plus FOOD and NONFOOD, lower-case the
* names, and hold the result in a tempfile for the merge on urut
	import dbase "$KP_deid_susenas/Raw/Household/dbf/sep21/blok43.dbf", clear

	keep URUT FOOD NONFOOD
	rename (URUT FOOD NONFOOD) (urut food nonfood)

	tempfile cons
	save `cons'

* Program and housing variables (msbp21rt): lower-case all names, reduce to
* one row per urut, and hold the result in a tempfile
	import dbase "$KP_deid_susenas/Raw/Household/dbf/sep21/msbp21rt.dbf", clear
	rename *, lower

	* Each listed variable becomes its mean within urut
	collapse (mean) m2006a2 m2002 m2003 m2004 m2005a* m1902 m1905, by(urut)

	tempfile hh_social_assistance
	save `hh_social_assistance', replace

* Load the individual-level SUSENAS file
	use "$KP_deid_susenas/Raw/Individual/SUSENAS_SEP21_deid.dta", clear

* Attach urut-level files, many individuals to one urut row each. The order
* is kept as Prakerja variables, then program variables, then consumption,
* because it determines the order of the added variables.
	foreach hh_file in ///
		"$KP_deid_susenas/Raw/Individual/msbp_prakerja_sep21.dta" ///
		"`hh_social_assistance'" ///
		"`cons'" {
		fmerge m:1 urut using "`hh_file'", nogen
	}

*******************************************
* Demographics

* Gender: two mirrored 0/1 flags built from m405 (1 -> gender, 2 -> female).
* Any other m405 value, including missing, codes as 0 in both.
	generate gender = (m405 == 1)
	label define gender 0 "Female" 1 "Male"
	label values gender gender

	generate female = (m405 == 2)
	label define female 0 "Male" 1 "Female"
	label values female female

* Number of HH members
	generate hh_size_sus = m301

* Marital Status
	generate marital_status = m404

* Age First Married
	generate age_married = m409

* Age, plus 20-year bands 1-5 covering ages 0-100
	generate age_sus = m407
	recode age_sus (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat)

* Date of birth: year as recorded; month and day with code 98 set to missing
	generate year_dob_sus = m406c
	generate month_dob_sus = m406b if m406b != 98
	generate day_dob_sus = m406a if m406a != 98

* Dummy for age <= 30 (a missing age does not satisfy <= 30, so it codes 0)
	generate young = (age_sus <= 30)
	label define young 0 "Over 30 Years Old" 1 "30 and Under"
	label values young young

* Num Children: count of members under 18 within each urut
	generate child = (age_sus < 18)
	by urut, sort: egen num_child = total(child)

* Urban/Rural
	generate urban_sus = (m105 == 1)

* Education (translated to years of schooling)
* Note: coding changed between waves Mar '21 and Sep '21
* school_years stays missing for any m1405 code not listed below.
	gen educ = m1405
	gen school_years = .
	replace school_years = 6 if inlist(educ, 1, 2, 3, 4) // Paket A, SDLB, SD, MI
	replace school_years = 9 if inlist(educ, 5, 6, 7, 8, 9, 10) // Paket B, SMP LB, SMP, MTs
	replace school_years = 12 if inlist(educ, 11, 12, 13, 14, 15, 16, 17) // Paket C, SMLB, SMA, MA, SMK, MAK
	replace school_years = 13.5 if educ == 18 // Diploma I/II
	replace school_years = 15 if educ == 19 // Diploma III
	replace school_years = 16 if inlist(educ, 20, 21) // Diploma IV / SI
	replace school_years = 18 if inlist(educ, 22, 23) // SII/Profesi
	replace school_years = 21 if educ == 24 // SIII
	replace school_years = 3 if educ == 25 // NOTE(review): category for code 25 is not documented here - confirm against the questionnaire

* Dummy for educated (graduating high school)
* FIX: the previous test was "school_years > 12", which (a) excluded people
* with exactly 12 years even though the label says "12 or More", and
* (b) coded a missing school_years as 1, because Stata treats missing as
* larger than any number. Missing schooling now stays missing.
* NOTE: this changes the values of educated, so the expected datasignature
* strings checked before each save in this file must be regenerated.
	gen educated = school_years >= 12 if !missing(school_years)
	la def educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
	la val educated educated
	
* Literacy Dummy
	generate literate_any = (m805 == 1)

* Relation to Head of Household
	generate relation_hh_head = m403

* HH Survey Taker Dummy: 1 when m410 equals m401
	generate surveytaker = (m410 == m401)
*******************************************
* Consumption

* Per-capita consumption: household food / nonfood divided by household
* size (m301). For food, non-food and their sum this creates the raw level,
* its log, a winsorized version in 1000s, and the log of that.
*
* Winsorizing: the 1st and 99th percentiles are computed over strictly
* positive values only, then every non-missing value above p99 (below p1) is
* replaced by p99 (p1). Values are replaced, not dropped, so the labels say
* "winsorized" (they previously said "trimmed"). Labels are kept under
* Stata's 80-character limit.
* food
	gen cons_food = food/m301
	gen lcon_food = log(cons_food)

	gen cons_adj_food = cons_food
	sum cons_adj_food if cons_adj_food > 0, d
	* r(p1) and r(p99) from the summarize above stay available across the replaces
	replace cons_adj_food = r(p99) if cons_adj_food > r(p99) & !mi(cons_adj_food)
	replace cons_adj_food = r(p1) if cons_adj_food < r(p1) & !mi(cons_adj_food)
	replace cons_adj_food = cons_adj_food / 1000
	la var cons_adj_food "Adjusted Food Consumption (in 1000s of rupiah, winsorized at 1st/99th pct)"
	sum cons_adj_food, d
	gen lcon_adj_food = log(cons_adj_food)

* non-food
	gen cons_nfood = nonfood/m301
	gen lcon_nfood = log(cons_nfood)

	gen cons_adj_nfood = cons_nfood
	sum cons_adj_nfood if cons_adj_nfood > 0, d
	replace cons_adj_nfood = r(p99) if cons_adj_nfood > r(p99) & !mi(cons_adj_nfood)
	replace cons_adj_nfood = r(p1) if cons_adj_nfood < r(p1) & !mi(cons_adj_nfood)
	replace cons_adj_nfood = cons_adj_nfood / 1000
	la var cons_adj_nfood "Adjusted Non-Food Consumption (in 1000s of rupiah, winsorized at 1st/99th pct)"
	sum cons_adj_nfood, d
	gen lcon_adj_nfood = log(cons_adj_nfood)

* tot consumption
	gen tot_cons = cons_food + cons_nfood
	gen ltot_cons = log(tot_cons)

	gen tot_adj_cons = tot_cons
	sum tot_adj_cons if tot_adj_cons > 0, d
	replace tot_adj_cons = r(p99) if tot_adj_cons > r(p99) & !mi(tot_adj_cons)
	replace tot_adj_cons = r(p1) if tot_adj_cons < r(p1) & !mi(tot_adj_cons)
	replace tot_adj_cons = tot_adj_cons / 1000
	la var tot_adj_cons "Adjusted Total Consumption (in 1000s of rupiah, winsorized at 1st/99th pct)"
	sum tot_adj_cons, d
	gen ltot_adj_cons = log(tot_adj_cons)

*******************************************
* Disability

* Disability items m1102-m1108. For each item keep the raw code
* (*_disability) and a 0/1 flag (*_disabled). Items m1102, m1104, m1106 and
* m1108 are flagged on codes 1-3; items m1103, m1105 and m1107 on codes 5-7.
	tabulate m1102, missing
	generate vision_disabled = inlist(m1102, 1, 2, 3)
	generate vision_disability = m1102

	tabulate m1103, missing
	generate hearing_disabled = inlist(m1103, 5, 6, 7)
	generate hearing_disability = m1103

	tabulate m1104, missing
	generate walk_disabled = inlist(m1104, 1, 2, 3)
	generate walk_disability = m1104

	tabulate m1105, missing
	generate hand_disabled = inlist(m1105, 5, 6, 7)
	generate hand_disability = m1105

	tabulate m1106, missing
	generate att_disabled = inlist(m1106, 1, 2, 3)
	generate att_disability = m1106

	tabulate m1107, missing
	generate emo_disabled = inlist(m1107, 5, 6, 7)
	generate emo_disability = m1107

	tabulate m1108, missing
	generate speech_disabled = inlist(m1108, 1, 2, 3)
	generate speech_disability = m1108

	summarize *disabled

* Any flag set. Every flag is 0/1 and never missing, so a logical OR gives
* the same result as testing whether their sum is at least 1.
	generate any_disability = vision_disabled | hearing_disabled | walk_disabled | ///
		hand_disabled | speech_disabled | att_disabled | emo_disabled
	tabulate any_disability, missing

* Severe: the first two of the flagged codes on any item (1-2 or 5-6)
	generate severe_disability = ///
		inlist(m1102, 1, 2) | inlist(m1103, 5, 6) | ///
		inlist(m1104, 1, 2) | inlist(m1105, 5, 6) | ///
		inlist(m1106, 1, 2) | inlist(m1107, 5, 6) | ///
		inlist(m1108, 1, 2)
	tabulate severe_disability


*******************************************
* Protein Consumption

* Consumption of protein (categorical scale of 1-4)
* FIX: cons_vegprotein was previously built from m606, the same source as
* cons_meatprotein, although the item tabulated just before it is m605. The
* two variables were therefore identical. It now uses m605.
* NOTE: this changes the values of cons_vegprotein, so the expected
* datasignature strings checked before each save in this file must be
* regenerated.
	tab m605
	gen cons_vegprotein = m605

	tab m606
	gen cons_meatprotein = m606


*******************************************
* Internet Use

	* Any internet use (m803 == 1)
	tabulate m803
	generate use_internet = (m803 == 1)

	* Specific uses: each m804_* item is flagged when it holds its own letter
	tabulate m804_b
	generate use_internet_learn = (m804_b == "B")

	tabulate m804_e
	generate use_internet_shop = (m804_e == "E")

	tabulate m804_h
	generate use_internet_bank = (m804_h == "H")


*******************************************
* Employed in week prior to survey

* Employment status (m1706) and 0/1 flags for individual statuses
	generate employment_status = m1706
	generate self_employed = (m1706 == 1)
	generate business_owner = inlist(m1706, 2, 3)
	generate perm_worker = (m1706 == 4)
	generate family_unpaid = (m1706 == 6)
	generate self_emp_bus_owner = inlist(m1706, 1, 2, 3)

	* Statuses 1-5 count as employed; status 6 and missing do not
	generate employed = inlist(employment_status, 1, 2, 3, 4, 5)
	label define emp ///
			1 "Self-employed" ///
			2 "Business with temp emp" ///
			3 "Business with perm emp" ///
			4 "Permanent Worker" ///
			5 "Freelancer" ///
			6 "Family/unpaid worker", replace
	label values employment_status emp
	tabulate employed employment_status, missing row

	* Temporarily not working: m1704 code 5 becomes 0, code 1 stays 1, and
	* anyone flagged as employed is set to 0
	tabulate m1704
	generate temp_not_work = m1704
	replace temp_not_work = 0 if temp_not_work == 5
	replace temp_not_work = 0 if employed == 1
	tabulate temp_not_work employed, missing
	tabulate temp_not_work employment_status, missing

* Hours worked: m1707, forced to 0 for anyone not flagged as employed
	summarize m1707 if employed == 1
	summarize m1707 if m1707 != 0
	generate hours_worked = cond(employed == 0, 0, m1707)

	bysort urut: egen hh_hrs_worked_all = total(hours_worked)

* Household employment: 1 if any member of the urut group has the flag
	bysort urut: egen hh_business = max(business_owner)
	bysort urut: egen hh_emp_self = max(self_employed)

*******************************************
* Savings

	* Savings flags: each m1322_* item is flagged when it holds its own letter
	tabulate m1322_a
	generate savings_bank = (m1322_a == "A")

	tabulate m1322_b
	generate savings_coop = (m1322_b == "B")

	tabulate m1322_c
	generate savings_other = (m1322_c == "C")

*******************************************
* Prakerja 

* Applied for Prakerja (anyone in HH)
	tabulate M2009K1, missing
	generate hh_pk_apply = (M2009K1 == 1)
	tabulate hh_pk_apply

* Count of household members that applied for KP
	tabulate M2009K2, missing
	generate hh_applied_count = M2009K2
	tabulate hh_applied_count

* Count of household members that won KP
	tabulate M2009K3, missing
	generate hh_selected_count = M2009K3
	tabulate hh_selected_count

* At least one winner in the household; a missing count codes as 0
	generate hh_pk_win = (M2009K3 > 0) & !missing(M2009K3)
	tabulate hh_pk_win

*******************************************
* Other Social Programs

	* BLT-DD (m2006a2)
	tabulate m2006a2
	generate receive_BLTDD = (m2006a2 == 1)

	* BPNT: ever received (m2004)
	generate ever_BPNT = (m2004 == 1)
	tabulate ever_BPNT

	* BPNT by month: m2005a2, a3, a4, a5 feed the Aug, Jul, Jun, May 2021 flags
	local suffix = 2
	foreach mon in aug jul jun may {
		generate `mon'21_BPNT = (m2005a`suffix' == 1)
		tabulate `mon'21_BPNT
		local ++suffix
	}

	* PKH Last Year
	tabulate m2002
	generate receive_pkh_year = (m2002 == 1)

	* PKH now
	tabulate m2003
	generate receive_pkh_now = (m2003 == 1)

	* PIP: flagged when m1514_a holds the letter "A"
	tabulate m1514_a
	generate receive_PIP = (m1514_a == "A")

*******************************************
* House Characteristics
	* Raw copies of m1902 and m1905
	generate house_area = m1902

	generate house_pln_elec = m1905
	* 1 when m1905 is 1 or 4, 0 for other values, missing when m1905 is missing
	generate has_pln_elec = (m1905 == 1 | m1905 == 4) if !missing(m1905)

/*----------------------------------------------------*/
            * Export cleaned data
/*----------------------------------------------------*/

	rename fwt weight

	* Reproducibility gate: the cleaned data are saved only when their
	* datasignature matches the expected value. On a mismatch the script
	* aborts with return code 459 (data-consistency error). It previously
	* relied on "stop", which is not a Stata command and halted only through
	* an "unrecognized command" error.
	datasignature
	if "`r(datasignature)'" != "272088:371(82915):495830884:90232650" {
		di as err "Careful, your machine produces a different dataset"
		error 459
	}
	save "$KP_deid_susenas/Clean/sus_sep21_deid_clean.dta", replace
	

/*----------------------------------------------------*/
          * Merge with admin data and export
/*----------------------------------------------------*/

	* Tag every row with this survey round's number
	gen sus_round = 7

	* Keep only rows that carry an anon_id4, the key used for the merge below
	keep if !mi(anon_id4)
	* m101 would be caught by the m1* wildcard, so it is renamed out of the
	* way before the drop and restored afterwards
	rename m101 r101
	drop m1* m3* m4* m5* m6* m7* m8* m9* date_incentive
	rename r101 m101

* check duplicates
	duplicates tag anon_id4, gen(anon_id4_dupe)
	tab anon_id4_dupe

* drop duplicates with more missing data
* Within each anon_id4 the row with the fewest missing values sorts first
* and is the one kept.
* NOTE(review): rows tied on the missing count are ordered arbitrarily by
* sort, so which tied duplicate survives is not pinned down here - confirm
* this is acceptable.
	egen missing = rowmiss(*)
	sort anon_id4 missing
	bysort anon_id4: gen n = _n
	drop if n != 1 & !missing(anon_id4)
	drop n

	* One survey row per anon_id4 against many rows in the using file; only
	* matched rows are kept (keep(3))
	merge 1:m anon_id4 using "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", nogen keep(3)
	
	* After the merge, the variable named urban is relabelled as the PMO
	* version and the survey-based urban_sus takes over the name urban
	rename urban urban_pmo 
	la var urban_pmo "Urban (PMO Data, based on city ID)"
	
	rename urban_sus urban 
	la var urban "Urban (SUSENAS survey data)"
	
	* generate variable to indicate date_incentive is close to the survey date
	* FIX: the window was hard-coded to 01apr2020-01sep2020, a year before
	* this September 2021 survey; the year is now 2021.
	* NOTE(review): the original comment says "within 4 months", but
	* 01apr-01sep spans five months. The months are kept as they were -
	* confirm the intended start date.
	* NOTE: this changes the values of incentive_near_sus, so the expected
	* datasignature checked before saving the merged data must be regenerated.
	gen sus_end_date = date("01sep2021","DMY")
	format sus_end_date %td 

	gen sus_start_date = date("01apr2021","DMY")
	format sus_start_date %td

	* inrange() is inclusive of both end points
	gen incentive_near_sus = inrange(date_incentive,sus_start_date,sus_end_date)
	tab incentive_near_sus 

	* Remove helper variables from the duplicate check and the window
	drop *_dupe missing sus_end_date sus_start_date

	compress

	* Reproducibility gate: the merged data are saved only when their
	* datasignature matches the expected value. On a mismatch the script
	* aborts with return code 459 (data-consistency error). It previously
	* relied on "stop", which is not a Stata command and halted only through
	* an "unrecognized command" error.
	datasignature
	if "`r(datasignature)'" != "47201:181(72647):4151805916:1468235139" {
		di as err "Careful, your machine produces a different dataset"
		error 459
	}
	save "$KP_deid_susenas/Clean/sus_sep21_deid_clean_merged.dta", replace

	* Close the log opened at the top of this file so later output is not
	* appended to it
	cap log close

	// END
